In [ ]:
import tweepy
import sys
import jsonpickle
import os
In [ ]:
# Make progress messages appear promptly instead of sitting in a buffer.
# Python 3 rejects unbuffered text streams (os.fdopen(..., 'w', 0) raises
# ValueError), so there we switch the existing stream to line buffering when
# it supports reconfigure() (Python 3.7+ TextIOWrapper). Streams without it
# (e.g. a notebook's stdout replacement) are left untouched.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(line_buffering=True)
elif sys.version_info[0] < 3:
    # Python 2: reopen stdout fully unbuffered, as the script originally did.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

# API and ACCESS KEYS
API_KEY = '<TWITTER_API_KEY>' # Change me
API_SECRET = '<TWITTER_API_SECRET>' # Change me

# Search terms passed verbatim as the `q` parameter of the search call.
searchQuery = 'bash OR bashbleed OR shellshock OR cve-2014-6271'
# Upper bound on the number of tweets to download in one run.
maxTweets = 500000
tweetsPerQry = 100 #Max Allowed per Query
# Output file: one JSON-encoded tweet per line.
fName = 'shellshockTweets.txt'
In [ ]:
# Authenticate with the application key/secret defined above.
auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
# We make the client wait in case we exceed our rate of 450 queries / 15 min.
# So this can take a while before all tweets are fetched.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
if not api:
    print("Can't Authenticate Check Creds!")
    sys.exit(-1)

# Id of the oldest tweet written so far; stays <= 0 until the first page
# has been received.
last_id = -1
tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        # Never request more than is still needed, so the total written
        # cannot exceed maxTweets even when it is not a multiple of
        # tweetsPerQry.
        search_args = {
            'q': searchQuery,
            'count': min(tweetsPerQry, maxTweets - tweetCount),
        }
        if last_id > 0:
            # Page backwards in time: only fetch tweets older than the last
            # one saved (max_id is inclusive per Twitter's docs, hence -1).
            search_args['max_id'] = str(last_id - 1)

        # Only the API call is expected to raise TweepError; stop on failure
        # and keep whatever has been written so far.
        try:
            new_tweets = api.search(**search_args)
        except tweepy.TweepError as e:
            print("some error : " + str(e))
            break

        if not new_tweets:
            print("No more tweets found")
            break

        # One JSON document per line.
        for tweet in new_tweets:
            f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        last_id = new_tweets[-1].id

# The `with` block has already closed the file at this point.
print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))
POST ACTION
After the file is downloaded, compress it by running `gzip shellshockTweets.txt`.
TODO: Automate the gzipping step.